ML Methods

Authors
Affiliation

Mahira Ayub

Boston University

Ava Godsy

Boston University

Joshua Lawrence

Boston University

import os

# Point Spark at a JDK only if the environment doesn't already provide one.
# setdefault keeps an existing JAVA_HOME (e.g. on CI or a teammate's machine)
# instead of clobbering it with this machine-specific path.
os.environ.setdefault('JAVA_HOME', r'C:\Program Files\Java\jdk-17')

import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"

# Stop any existing Spark session so the config below actually takes effect.
active = SparkSession.getActiveSession()
if active is not None:
    active.stop()

# Initialize Spark Session with explicit local master
spark = SparkSession.builder \
    .appName("LightcastData") \
    .master("local[*]") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()

# Load the postings CSV. multiLine + escape handle quoted fields that
# contain embedded newlines and double quotes.
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("multiLine", "true") \
    .option("escape", "\"") \
    .csv("data/lightcast_job_postings.csv")

# Diagnostic checks (not needed in the final doc):
# df.printSchema()
# df.show(5)
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.stat import Correlation
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Keep only rows usable for modelling: a non-null, positive salary plus
# non-null state and job-title features.
df_clean = df.filter(
    (F.col("SALARY").isNotNull()) & 
    (F.col("SALARY") > 0) &
    (F.col("STATE_NAME").isNotNull()) &
    (F.col("TITLE_NAME").isNotNull())
)

# NOTE: each count() triggers a full Spark job over the dataset.
print(f"Original dataset size: {df.count():,}")
print(f"Cleaned dataset size: {df_clean.count():,}")

# One-row summary of the salary distribution (mean/median/std/min/max).
salary_stats = df_clean.select(
    F.mean("SALARY").alias("mean_salary"),
    F.expr("percentile_approx(SALARY, 0.5)").alias("median_salary"),  # approximate median
    F.stddev("SALARY").alias("std_salary"),
    F.min("SALARY").alias("min_salary"),
    F.max("SALARY").alias("max_salary")
).collect()[0]

print(f"\nSalary Statistics:")
print(f"  Mean: ${salary_stats['mean_salary']:,.2f}")
print(f"  Median: ${salary_stats['median_salary']:,.2f}")
print(f"  Std Dev: ${salary_stats['std_salary']:,.2f}")
print(f"  Min: ${salary_stats['min_salary']:,.2f}")
print(f"  Max: ${salary_stats['max_salary']:,.2f}")

# Binary classification target: 1 if salary is above the overall mean, else 0.
avg_salary = salary_stats['mean_salary']
df_clean = df_clean.withColumn(
    "ABOVE_AVERAGE_SALARY",
    F.when(F.col("SALARY") > avg_salary, 1).otherwise(0)
)
Original dataset size: 72,498
Cleaned dataset size: 30,808

Salary Statistics:
  Mean: $117,953.76
  Median: $116,300.00
  Std Dev: $45,133.88
  Min: $15,860.00
  Max: $500,000.00
# Reduce SKILLS_NAME (an array-like string, possibly null) to one "primary"
# skill: strip brackets/quotes/newlines, split on commas, take the first entry.
df_clean = df_clean.withColumn(
    "PRIMARY_SKILL",
    F.when(
        F.col("SKILLS_NAME").isNotNull(),
        F.split(F.regexp_replace(F.col("SKILLS_NAME"), r'[\[\]"\n]', ''), ",").getItem(0)
    ).otherwise("No Skills Listed")
)


def _clean_text_column(name, fallback):
    """Trimmed value of column `name`, with nulls/blank strings replaced by `fallback`."""
    col = F.col(name)
    is_blank = col.isNull() | (F.trim(col) == "")
    return F.when(is_blank, fallback).otherwise(F.trim(col))


# Modelling frame: the three categorical features plus both targets,
# with null/blank categories normalized to explicit placeholder labels.
features_df = (
    df_clean
    .select("STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", "SALARY", "ABOVE_AVERAGE_SALARY")
    .withColumn("STATE_NAME", _clean_text_column("STATE_NAME", "Unknown"))
    .withColumn("TITLE_NAME", _clean_text_column("TITLE_NAME", "Unknown"))
    .withColumn("PRIMARY_SKILL", _clean_text_column("PRIMARY_SKILL", "No Skills Listed"))
)

# Frequency tables for each categorical feature.
for heading, column in [
    ("\nTop 10 States by Job Postings:", "STATE_NAME"),
    ("\nTop 10 Job Titles by Frequency:", "TITLE_NAME"),
    ("\nTop 10 Skills by Frequency:", "PRIMARY_SKILL"),
]:
    print(heading)
    features_df.groupBy(column).count().orderBy(F.desc("count")).show(10, truncate=False)

Top 10 States by Job Postings:
+--------------+-----+
|STATE_NAME    |count|
+--------------+-----+
|California    |3984 |
|Texas         |2544 |
|New York      |1996 |
|Florida       |1504 |
|Virginia      |1347 |
|Illinois      |1271 |
|North Carolina|962  |
|Colorado      |943  |
|Washington    |933  |
|Ohio          |927  |
+--------------+-----+
only showing top 10 rows


Top 10 Job Titles by Frequency:
+-------------------------------+-----+
|TITLE_NAME                     |count|
+-------------------------------+-----+
|Data Analysts                  |3632 |
|Business Intelligence Analysts |991  |
|Unclassified                   |976  |
|Oracle Cloud HCM Consultants   |714  |
|Enterprise Architects          |699  |
|Data Analytics Engineers       |349  |
|Data and Reporting Analysts    |319  |
|Data Governance Analysts       |288  |
|Principal Architects           |227  |
|Enterprise Solutions Architects|225  |
+-------------------------------+-----+
only showing top 10 rows


Top 10 Skills by Frequency:
+--------------------+-----+
|PRIMARY_SKILL       |count|
+--------------------+-----+
|Power BI            |2131 |
|Management          |1489 |
|Customer Service    |1467 |
|Research            |1381 |
|Business Objectives |1257 |
|Detail Oriented     |985  |
|Relational Databases|834  |
|SAP S/4HANA         |820  |
|Salesforce          |670  |
|Presentations       |643  |
+--------------------+-----+
only showing top 10 rows
# Encoding stages for the three categorical features.
# handleInvalid="keep" routes categories unseen at fit time into an extra bucket.
_CATEGORICALS = [
    ("STATE_NAME", "STATE"),
    ("TITLE_NAME", "TITLE"),
    ("PRIMARY_SKILL", "SKILL"),
]

state_indexer, title_indexer, skill_indexer = (
    StringIndexer(inputCol=col, outputCol=f"{prefix}_INDEX", handleInvalid="keep")
    for col, prefix in _CATEGORICALS
)
state_encoder, title_encoder, skill_encoder = (
    OneHotEncoder(inputCol=f"{prefix}_INDEX", outputCol=f"{prefix}_VEC")
    for _, prefix in _CATEGORICALS
)

# Concatenate the three one-hot vectors into a single feature vector.
assembler = VectorAssembler(
    inputCols=["STATE_VEC", "TITLE_VEC", "SKILL_VEC"],
    outputCol="features"
)

# 80/20 train/test split, seeded for reproducibility.
train_data, test_data = features_df.randomSplit([0.8, 0.2], seed=42)

print(f"Training set size: {train_data.count():,}")
print(f"Test set size: {test_data.count():,}")
Training set size: 24,676
Test set size: 6,132
# L2-regularized linear regression predicting the raw salary.
lr = LinearRegression(featuresCol="features", labelCol="SALARY", maxIter=100, regParam=0.1)

# Full pipeline: index -> one-hot encode -> assemble -> regress.
lr_pipeline = Pipeline(stages=[
    state_indexer, title_indexer, skill_indexer,
    state_encoder, title_encoder, skill_encoder,
    assembler, lr
])

# Fit on the training split and score the held-out split.
lr_model = lr_pipeline.fit(train_data)
lr_predictions = lr_model.transform(test_data)


def _regression_metric(metric):
    """Evaluate `metric` (rmse/r2/mae) on the test-set predictions."""
    evaluator = RegressionEvaluator(
        labelCol="SALARY", predictionCol="prediction", metricName=metric
    )
    return evaluator.evaluate(lr_predictions)


rmse = _regression_metric("rmse")
r2 = _regression_metric("r2")
mae = _regression_metric("mae")

print(f"\nLinear Regression Model Performance:")
print(f"  RMSE: ${rmse:,.2f}")
print(f"  R² Score: {r2:.4f}")
print(f"  MAE: ${mae:,.2f}")

# Spot-check a few predictions next to the actual salaries.
print("\nSample Predictions:")
lr_predictions.select("STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", "SALARY", "prediction").show(10, truncate=50)

Linear Regression Model Performance:
  RMSE: $31,213.94
  R² Score: 0.5332
  MAE: $21,417.79

Sample Predictions:
+----------+--------------------------------------------+-------------------+------+------------------+
|STATE_NAME|                                  TITLE_NAME|      PRIMARY_SKILL|SALARY|        prediction|
+----------+--------------------------------------------+-------------------+------+------------------+
|   Alabama|                         Analytics Engineers|        Reliability|134500|134287.38233297397|
|   Alabama|              Business Intelligence Analysts|Business Objectives|105200|  108386.078482199|
|   Alabama|              Business Intelligence Analysts|   Data Warehousing|112097|107321.68487090743|
|   Alabama|              Business Intelligence Analysts|           Power BI| 59582| 98300.68285045939|
|   Alabama|              Business Intelligence Analysts|           Research|125000|   98235.926749922|
|   Alabama|Business Intelligence and Analytics Managers|   Microsoft Access| 97055|105651.29289851415|
|   Alabama|                                 Consultants|      Communication|130500|124869.98049877661|
|   Alabama|                               Data Analysts|           Auditing| 93600| 83656.68287552655|
|   Alabama|                               Data Analysts|     Invoice Review| 80400| 80371.22837983494|
|   Alabama|                               Data Analysts|             Kibana| 95000|  89909.6430482245|
+----------+--------------------------------------------+-------------------+------+------------------+
only showing top 10 rows
# Logistic regression on the binary above-average-salary target,
# reusing the same encoding stages as the linear model.
log_reg = LogisticRegression(featuresCol="features", labelCol="ABOVE_AVERAGE_SALARY", maxIter=100)

log_pipeline = Pipeline(stages=[
    state_indexer, title_indexer, skill_indexer,
    state_encoder, title_encoder, skill_encoder,
    assembler, log_reg
])

# Fit on the training split and score the held-out split.
log_model = log_pipeline.fit(train_data)
log_predictions = log_model.transform(test_data)

# Ranking quality: area under the ROC curve.
auc = BinaryClassificationEvaluator(
    labelCol="ABOVE_AVERAGE_SALARY", metricName="areaUnderROC"
).evaluate(log_predictions)

# Plain accuracy: fraction of rows where the label matches the predicted class.
n_correct = log_predictions.filter(
    F.col("ABOVE_AVERAGE_SALARY") == F.col("prediction")
).count()
accuracy = n_correct / log_predictions.count()

print(f"\nLogistic Regression Model Performance:")
print(f"  AUC-ROC: {auc:.4f}")
print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Label/prediction cross-tabulation.
print("\nConfusion Matrix:")
confusion_matrix = log_predictions.groupBy("ABOVE_AVERAGE_SALARY", "prediction").count()
confusion_matrix.orderBy("ABOVE_AVERAGE_SALARY", "prediction").show()

Logistic Regression Model Performance:
  AUC-ROC: 0.8779
  Accuracy: 0.8081 (80.81%)

Confusion Matrix:
+--------------------+----------+-----+
|ABOVE_AVERAGE_SALARY|prediction|count|
+--------------------+----------+-----+
|                   0|       0.0| 2665|
|                   0|       1.0|  485|
|                   1|       0.0|  692|
|                   1|       1.0| 2290|
+--------------------+----------+-----+

# Inspect the fitted LinearRegressionModel (last stage of the pipeline).
fitted_lr = lr_model.stages[-1]
coefficients = fitted_lr.coefficients
intercept = fitted_lr.intercept

print(f"Model Intercept: ${intercept:,.2f}")
print(f"Number of features: {len(coefficients)}")
print(f"Sum of coefficients: {sum(coefficients):,.2f}")
Model Intercept: $117,965.27
Number of features: 4219
Sum of coefficients: -34,383,304.02
import os
import kaleido  # backend used implicitly by fig.write_image

# Pull a 1,000-row sample of test-set predictions into pandas for plotting.
pred_data = lr_predictions.select("SALARY", "prediction", "STATE_NAME", "TITLE_NAME").limit(1000).collect()
pred_sample = pd.DataFrame(pred_data, columns=["SALARY", "prediction", "STATE_NAME", "TITLE_NAME"])

fig1 = px.scatter(
    pred_sample, 
    x="SALARY", 
    y="prediction",
    title="Actual vs Predicted Salary (Linear Regression)",
    labels={"SALARY": "Actual Salary ($)", "prediction": "Predicted Salary ($)"},
    opacity=0.6,
    hover_data=["STATE_NAME", "TITLE_NAME"],
    color_discrete_sequence=['#78C2AD']
)

# Reference diagonal: points on this line are perfectly predicted.
lo, hi = pred_sample["SALARY"].min(), pred_sample["SALARY"].max()
fig1.add_trace(go.Scatter(
    x=[lo, hi],
    y=[lo, hi],
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='red', dash='dash')
))

# Shared Verdana styling (base text + axis titles at 14pt, title at 18pt).
body_font = dict(family="Verdana", size=14, color="black")
fig1.update_layout(
    font=body_font,
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=body_font),
    yaxis=dict(title_font=body_font),
)
fig1.show()

# Create the output directory up front so write_image cannot fail on a
# missing path.
os.makedirs("figures", exist_ok=True)
fig1.write_image("figures/regressionscatter.png")


import os

# Top 10 states by mean salary, with posting counts for context.
state_salary_data = df_clean.groupBy("STATE_NAME").agg(
    F.mean("SALARY").alias("avg_salary"),
    F.count("SALARY").alias("job_count")
).orderBy(F.desc("avg_salary")).limit(10).collect()
state_salary = pd.DataFrame(state_salary_data, columns=["STATE_NAME", "avg_salary", "job_count"])

fig2 = px.bar(
    state_salary,
    x="STATE_NAME",
    y="avg_salary",
    title="Top 10 States by Average Salary",
    labels={"STATE_NAME": "State", "avg_salary": "Average Salary ($)"},
    color="avg_salary",
    text="job_count",
    color_continuous_scale=px.colors.sequential.Mint
)
# Annotate each bar with its posting count.
fig2.update_traces(texttemplate='Jobs: %{text}', textposition='outside')

# Shared Verdana styling (base text + axis titles at 14pt, title at 18pt).
body_font = dict(family="Verdana", size=14, color="black")
fig2.update_layout(
    font=body_font,
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=body_font),
    yaxis=dict(title_font=body_font),
)
fig2.show()

# Ensure the output directory exists before exporting the image.
os.makedirs("figures", exist_ok=True)
fig2.write_image("figures/top10statesbarplot.png")


import os

# Class balance of the binary target among test-set predictions.
salary_class_data = log_predictions.groupBy("ABOVE_AVERAGE_SALARY").count().collect()
salary_class = pd.DataFrame(salary_class_data, columns=["ABOVE_AVERAGE_SALARY", "count"])
salary_class["Category"] = salary_class["ABOVE_AVERAGE_SALARY"].map({0: "Below Average", 1: "Above Average"})

fig3 = px.pie(
    salary_class,
    values="count",
    names="Category",
    title="Distribution of Above/Below Average Salaries",
    color_discrete_sequence=["#78C2AD", "#F3969A"]
)

# Verdana styling. The xaxis/yaxis title-font settings used by the other
# figures are dropped here: a pie chart has no cartesian axes, so they
# were dead configuration.
fig3.update_layout(
    font=dict(family="Verdana", size=14, color="black"),
    title=dict(font=dict(family="Verdana", size=18, color="black")),
)
fig3.show()

# Ensure the output directory exists before exporting the image.
os.makedirs("figures", exist_ok=True)
fig3.write_image("figures/salaryaveragepiechart.png")


import os

# Top 15 job titles by mean salary, restricted to titles with at least
# 5 postings so single-posting outliers don't dominate.
# (The original filter was `count > 5`, i.e. min 6, which contradicted the
# chart title and the write-up; `>= 5` matches the documented intent.)
title_salary_data = df_clean.groupBy("TITLE_NAME").agg(
    F.mean("SALARY").alias("avg_salary"),
    F.count("SALARY").alias("count")
).filter(F.col("count") >= 5).orderBy(F.desc("avg_salary")).limit(15).collect()
title_salary = pd.DataFrame(title_salary_data, columns=["TITLE_NAME", "avg_salary", "count"])

fig4 = px.bar(
    title_salary,
    x="avg_salary",
    y="TITLE_NAME",
    orientation='h',
    title="Top 15 Job Titles by Average Salary (min 5 postings)",
    labels={"TITLE_NAME": "Job Title", "avg_salary": "Average Salary ($)"},
    color="avg_salary",
    color_continuous_scale=px.colors.sequential.Mint
)

# Shared Verdana styling (base text + axis titles at 14pt, title at 18pt).
body_font = dict(family="Verdana", size=14, color="black")
fig4.update_layout(
    font=body_font,
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=body_font),
    yaxis=dict(title_font=body_font),
)
fig4.show()

# Ensure the output directory exists before exporting the image.
os.makedirs("figures", exist_ok=True)
fig4.write_image("figures/Top15Bar.png")

import os

# Persist test-set predictions (capped at 10k rows) for downstream use.
predictions_data = lr_predictions.select(
    "STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", 
    "SALARY", "prediction", "ABOVE_AVERAGE_SALARY"
).limit(10000).collect()  

predictions_pdf = pd.DataFrame(
    predictions_data, 
    columns=["STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", "SALARY", "prediction", "ABOVE_AVERAGE_SALARY"]
)

# Make sure the target directory exists so to_csv cannot fail on a
# missing path.
os.makedirs("output", exist_ok=True)
predictions_pdf.to_csv("output/salary_predictions.csv", index=False)
print(f"Results saved to: output/salary_predictions.csv ({len(predictions_pdf):,} rows)")
Results saved to: output/salary_predictions.csv (6,132 rows)

Actual vs Predicted Salaries (Linear Regression)

The scatter plot above compares actual versus predicted salaries generated by a linear regression model. Each point represents an observation, while the red dashed line indicates a perfect prediction—where actual and predicted salaries would be equal.

From the visualization, we can observe that the majority of predictions cluster near the perfect-fit line, suggesting that the model captures the general salary trend reasonably well. However, there is noticeable spread at higher salary levels, indicating that the model tends to underpredict high salaries and overpredict some lower ones, reflecting moderate accuracy but potential room for improvement in capturing extreme salary values.

Top 10 States by Average Salary

The bar chart above highlights the top 10 U.S. states by average salary in the dataset. Connecticut leads with the highest average salary, followed closely by Vermont, New Jersey, and Washington. While these states offer strong compensation levels, the number of available jobs varies significantly—from as few as 101 in Vermont to nearly 4,000 in California.

Overall, the visualization suggests that smaller states such as Connecticut and Vermont offer higher average pay but fewer total job opportunities, whereas larger labor markets like California and Virginia have more positions but slightly lower average salaries. This pattern indicates a tradeoff between salary level and job volume across states.

Distribution of Above/Below Average Salaries

The pie chart above displays the distribution of salaries above and below the overall average based on the binary classification assigned. Approximately 51.4% of roles fall below the average salary, while 48.6% are above average, indicating a nearly even split.

This balance suggests that salaries across the dataset are relatively symmetrically distributed, with only a slight tilt toward lower-paying positions. The small difference between the two segments highlights a moderately balanced job market, where compensation levels are fairly evenly dispersed around the mean.

Top 15 Job Titles by Average Salaries

The horizontal bar chart above highlights the top 15 job titles ranked by average salary among roles with at least five postings. The results show that Distinguished Architects and Enterprise Services Managers command the highest average salaries, both exceeding $270K annually. Other top earners include Directors of Enterprise Architecture and Advisory Solution Consultants, which also sit well above the $200K threshold.

Overall, the visualization indicates that executive-level and architecture-focused roles tend to yield the highest compensation, reflecting the premium placed on leadership, system design, and strategic technical expertise in today’s data and cloud-driven job market.

import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Load data from the same location the Spark cells use ("data/" prefix).
# The bare filename raised FileNotFoundError when this cell was last run.
# NOTE(review): this rebinds `df`, shadowing the Spark DataFrame from the
# earlier cells.
df = pd.read_csv("data/lightcast_job_postings.csv")

# Keep relevant columns; require both salary bounds to be present.
cols = ['STATE_NAME', 'TITLE_NAME', 'SPECIALIZED_SKILLS_NAME', 'SALARY_FROM', 'SALARY_TO']
df = df[cols].dropna(subset=['SALARY_FROM', 'SALARY_TO'])

# Midpoint of the posted salary range.
df['AVERAGE_SALARY'] = (df['SALARY_FROM'] + df['SALARY_TO']) / 2

# Create AI classification flag based on job title or skills.
# BUG FIX: the original uppercased the text but not the keywords, so
# mixed-case keywords ('Machine Learning', 'Artificial Intelligence', ...)
# could never match. Matching is now case-insensitive on word boundaries,
# which also stops short tokens like 'AI'/'ML' from matching inside
# unrelated words (e.g. 'maintain', 'HTML').
import re

ai_keywords = ['AI', 'Artificial Intelligence', 'Machine Learning', 'ML', 'Deep Learning', 
               'Data Scientist', 'Neural', 'Computer Vision', 'NLP']

_AI_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in ai_keywords) + r')\b',
    re.IGNORECASE,
)

def classify_ai(row):
    """Return 'AI' if the row's title or specialized skills mention an AI keyword, else 'Non-AI'."""
    text = f"{row['TITLE_NAME']} {row['SPECIALIZED_SKILLS_NAME']}"
    return 'AI' if _AI_PATTERN.search(text) else 'Non-AI'

# Flag each posting as AI / Non-AI from its title and skills text.
df['AI_CLASSIFICATION'] = df.apply(classify_ai, axis=1)

# Aggregate salary by region and AI classification
salary_summary = (
    df.groupby(['STATE_NAME', 'AI_CLASSIFICATION'])
      .agg(MEAN_SALARY=('AVERAGE_SALARY', 'mean'),
           COUNT=('AVERAGE_SALARY', 'count'))
      .reset_index()
)

# Plot salary comparison by state (grouped bars, AI vs Non-AI per state)
fig = px.bar(
    salary_summary,
    x='STATE_NAME',
    y='MEAN_SALARY',
    color='AI_CLASSIFICATION',
    barmode='group',
    text_auto='.2s',
    color_discrete_map={'AI': "#714E50", 'Non-AI': '#297C8A'},
    title="Average Salary Comparison by State and AI Classification"
)

fig.update_layout(
    xaxis_title="State",
    yaxis_title="Average Salary ($)",
    legend_title="Job Type",
    template="plotly_white",
    font=dict(family="Roboto", size=14),
    xaxis={'categoryorder': 'total descending'}
)

fig.show()

# Regression model – Predict salary using AI classification and state
# NOTE(review): LabelEncoder maps nominal states to arbitrary integers,
# and the linear model then treats them as ordered — the state coefficient
# has no real interpretation. One-hot encoding would be more appropriate;
# confirm intent.
le_state = LabelEncoder()
le_ai = LabelEncoder()

df['STATE_ENCODED'] = le_state.fit_transform(df['STATE_NAME'])
df['AI_FLAG'] = le_ai.fit_transform(df['AI_CLASSIFICATION'])

X = df[['STATE_ENCODED', 'AI_FLAG']]
y = df['AVERAGE_SALARY']

model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Model evaluation
# NOTE(review): metrics are computed on the training data itself (no
# train/test split), so R² here is optimistic.
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

print("R² Score:", round(r2, 3))
print("RMSE:", round(rmse, 2))
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 9
      6 import numpy as np
      8 # Load data
----> 9 df = pd.read_csv("lightcast_job_postings.csv")
     11 # Keep relevant columns
     12 cols = ['STATE_NAME', 'TITLE_NAME', 'SPECIALIZED_SKILLS_NAME', 'SALARY_FROM', 'SALARY_TO']

File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\parsers\readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\parsers\readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\parsers\readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\parsers\readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'lightcast_job_postings.csv'

The bar chart shown above compares average salaries across U.S. states, with jobs split between AI and non-AI roles. The graph shows that AI-related jobs have higher average salaries than non-AI jobs. Some states, such as New Jersey, Connecticut, Montana, and Arkansas, have the highest salaries for AI roles. Looking at the overall data, we can conclude that AI expertise can boost earning potential nationwide.

# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.graph_objects as go

# --------------------------------------------------
# 1. Load and clean the dataset
# --------------------------------------------------
# low_memory=False parses the file in a single pass, avoiding the
# mixed-type DtypeWarning this cell previously emitted (columns 19/30).
# NOTE(review): other cells load from "data/lightcast_job_postings.csv";
# confirm which working directory this cell is meant to run from.
df_clean = pd.read_csv("lightcast_job_postings.csv", low_memory=False)

# Find candidate salary columns (SALARY, SALARY_FROM, SALARY_TO, ...).
salary_cols = [col for col in df_clean.columns if 'salary' in col.lower()]
print("Salary-related columns found:", salary_cols)

# Choose one salary column: the first match (adjust manually if needed).
salary_col = salary_cols[0]

# Coerce the salary column to numeric; unparseable values become NaN.
df_clean[salary_col] = pd.to_numeric(df_clean[salary_col], errors='coerce')

# Drop rows where salary or state name is missing.
df_clean = df_clean.dropna(subset=[salary_col, 'STATE_NAME'])

# --------------------------------------------------
# 2. Compute average salary per state
# --------------------------------------------------
state_salary = df_clean.groupby('STATE_NAME')[salary_col].mean().reset_index()
state_salary.rename(columns={salary_col: 'AVERAGE_SALARY'}, inplace=True)

# --------------------------------------------------
# 3. Prepare data for clustering
# --------------------------------------------------
X = state_salary[['AVERAGE_SALARY']].copy()
# Numeric index used only to spread states along the x-axis when plotting.
state_salary['STATE_INDEX'] = np.arange(len(state_salary))

# --------------------------------------------------
# 4. Run KMeans clustering (1-D: salary only)
# --------------------------------------------------
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
state_salary['CLUSTER'] = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

# --------------------------------------------------
# 5. Visualization with Plotly
# --------------------------------------------------
colors = ['#3D7C6A', '#B14E53', '#297C8A']

fig = go.Figure()

# Add points for each cluster
for i in range(3):
    cluster_data = state_salary[state_salary['CLUSTER'] == i]
    fig.add_trace(go.Scatter(
        x=cluster_data['STATE_INDEX'],
        y=cluster_data['AVERAGE_SALARY'],
        mode='markers+text',
        name=f'Cluster {i+1}',
        text=cluster_data['STATE_NAME'],
        textposition="top center",
        marker=dict(color=colors[i], size=10, opacity=0.7),
        hovertemplate=(
            'State: %{text}<br>'
            'Average Salary: $%{y:,.0f}<br>'
            'Cluster: ' + str(i+1) + '<extra></extra>'
        )
    ))

# Place each centroid at the mean x-position of its cluster's members so
# the X lands among the points it summarizes (previously the centroids
# were drawn at the arbitrary positions x = 0, 1, 2).
centroid_x = [
    state_salary.loc[state_salary['CLUSTER'] == i, 'STATE_INDEX'].mean()
    for i in range(3)
]
fig.add_trace(go.Scatter(
    x=centroid_x,
    y=centroids.flatten(),
    mode='markers',
    name='Centroids',
    marker=dict(color='black', size=15, symbol='x', line=dict(width=2)),
    hovertemplate='Centroid Salary: $%{y:,.0f}<extra></extra>'
))

# --------------------------------------------------
# 6. Style the layout
# --------------------------------------------------
fig.update_layout(
    title=dict(text='KMeans Clustering by Average Salary per State', font=dict(size=18)),
    xaxis=dict(title='State Index (for plotting only)', tickfont=dict(size=12)),
    yaxis=dict(title='Average Salary ($)', tickfont=dict(size=12)),
    font=dict(family='Verdana', size=14),
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    hovermode='closest'
)

fig.show()
C:\Users\mahir\AppData\Local\Temp\ipykernel_6604\2145399916.py:10: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
Salary-related columns found: ['SALARY', 'SALARY_TO', 'SALARY_FROM']
Unable to display output for mime type(s): application/vnd.plotly.v1+json

The KMeans clustering graph groups the U.S. States into three clusters based on their average salaries. On the y-axis we have average salary level and on the x-axis we have the State index (used for spacing). Each dot shown on the graph represents a State.

The three clusters illustrate which states have higher-than-average, mid-range, and lower average salaries.

  • High-paying cluster (Cluster 1 – Green): This cluster includes states like NJ, MA, CT, and CA, which have the highest average salaries and tend to have high living costs and major economic hubs.
  • Mid-salary cluster (Cluster 2 – Red): This cluster includes states like OH, FL, LA, and NH, where salaries are moderate, reflecting a balanced labor market.
  • Lower-salary cluster (Cluster 3 – Teal): This cluster includes states like NV, NM, ND, and WV, which have low living costs and small urban centers.

The centroids — the black Xs on the graph — summarize the average salary level of each cluster. The top centroid marks the high-income states, the middle centroid the average-income states, and the bottom centroid the low-income states. These centroids help us interpret and visualize income-level groupings across different U.S. states.